
package org.apache.solr.update.processor;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Locale;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Update processor which examines a URL and outputs to various other fields
 * characteristics of that URL, including length, number of path levels, whether
 * it is a top level URL (levels==0), whether it looks like a landing/index page,
 * a canonical representation of the URL (e.g. stripping index.html), the domain
 * and path parts of the URL etc.
 * <p>
 * This processor is intended to be used when processing web resources; it
 * produces values which may later be used for boosting or filtering.
 */
public class URLClassifyProcessor extends UpdateRequestProcessor {

    // Names of the configuration parameters read in initParameters().
    private static final String INPUT_FIELD_PARAM = "inputField";
    private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField";
    private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField";
    private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField";
    private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField";
    private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField";
    private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField";

    // Field names used when the corresponding parameter is not configured.
    private static final String DEFAULT_URL_FIELDNAME = "url";
    private static final String DEFAULT_LENGTH_FIELDNAME = "url_length";
    private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels";
    private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel";
    private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage";

    private static final Logger log = LoggerFactory.getLogger(URLClassifyProcessor.class);

    /**
     * Lower-case path endings that mark a landing (index) page. {@code "/"} matches any
     * path ending in a slash; every other entry must be the complete last path segment.
     */
    private static final String[] LANDING_PAGE_SUFFIXES = {
        "/",
        "index.html",
        "index.htm",
        "index.phtml",
        "index.shtml",
        "index.xml",
        "index.php",
        "index.asp",
        "index.aspx",
        "welcome.html",
        "welcome.htm",
        "welcome.phtml",
        "welcome.shtml",
        "welcome.xml",
        "welcome.php",
        "welcome.asp",
        "welcome.aspx"
    };

    private boolean enabled = true;
    private String urlFieldname = DEFAULT_URL_FIELDNAME;
    private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME;
    private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME;
    private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME;
    private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME;
    // The two fields below are optional outputs: null means "do not write this field".
    private String domainFieldname = null;
    private String canonicalUrlFieldname = null;

    /**
     * Creates the processor and reads its configuration.
     *
     * @param parameters configuration parameters; when null all defaults are kept
     * @param request the current request (not used by this processor)
     * @param response the current response (not used by this processor)
     * @param nextProcessor the next processor in the chain
     */
    public URLClassifyProcessor(SolrParams parameters, SolrQueryRequest request, SolrQueryResponse response, UpdateRequestProcessor nextProcessor) {
        super(nextProcessor);

        this.initParameters(parameters);
    }

    /**
     * Copies the configured field names and the enabled flag from the given parameters,
     * falling back to the defaults for anything that is not set.
     */
    private void initParameters(SolrParams parameters) {
        if (parameters == null) {
            return;
        }
        this.setEnabled(parameters.getBool("enabled", true));
        this.urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME);
        this.lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME);
        this.levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME);
        this.toplevelpageFieldname = parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME);
        this.landingpageFieldname = parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME);
        this.domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM);
        this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM);
    }

    /**
     * Classifies the URL of the document being added (when the processor is enabled) and
     * then hands the command to the next processor. A URL that cannot be classified is
     * logged and skipped; it never prevents the document from being passed on.
     *
     * @param command the add command carrying the document
     * @throws IOException if thrown further down the processor chain
     */
    @Override
    public void processAdd(AddUpdateCommand command) throws IOException {
        if (isEnabled()) {
            classify(command.getSolrInputDocument());
        }
        super.processAdd(command);
    }

    /** Writes the URL characteristics of the configured input field into the document. */
    private void classify(SolrInputDocument document) {
        if (!document.containsKey(urlFieldname)) {
            return;
        }
        // The field may be present with a null or non-String value; neither may abort the update.
        Object rawValue = document.getFieldValue(urlFieldname);
        if (rawValue == null) {
            return;
        }
        String url = rawValue.toString();

        URL normalizedURL;
        try {
            normalizedURL = getNormalizedURL(url);
        }
        catch (MalformedURLException | URISyntaxException | IllegalArgumentException e) {
            // IllegalArgumentException is what URI.toURL() throws for a non-absolute URI.
            log.warn("cannot get the normalized url for \"{}\" due to {}", url, e.getMessage());
            return;
        }

        document.setField(lengthFieldname, length(normalizedURL));
        document.setField(levelsFieldname, levels(normalizedURL));
        document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0);
        document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0);
        if (domainFieldname != null) {
            document.setField(domainFieldname, normalizedURL.getHost());
        }
        if (canonicalUrlFieldname != null) {
            document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL));
        }
        log.debug("{}", document);
    }

    /**
     * Gets a canonical form of the URL for use as main URL. A trailing landing page file
     * name (for example {@code /index.html}, in any letter case) is replaced by {@code /}.
     * URLs carrying a query or a fragment are returned unchanged.
     *
     * @param url The input url
     * @return The URL object representing the canonical URL
     */
    public URL getCanonicalUrl(URL url) {
        // The path is only the tail of the string form when no query or fragment follows it;
        // otherwise a query that merely ends like a landing page would be rewritten.
        if (url.getQuery() != null || url.getRef() != null) {
            return url;
        }

        String urlString = url.toString();
        String tail = "/" + landingPageSuffix(url);
        int tailStart = urlString.length() - tail.length();
        // The suffix was detected on the lower-cased path, so compare ignoring case.
        if (tailStart < 0 || !urlString.regionMatches(true, tailStart, tail, 0, tail.length())) {
            return url;
        }

        try {
            return new URL(urlString.substring(0, tailStart) + "/");
        }
        catch (MalformedURLException e) {
            // Best effort: fall back to the input URL, but leave a trace of the failure.
            log.debug("cannot build the canonical url for \"{}\" due to {}", urlString, e.getMessage());
            return url;
        }
    }

    /**
     * Calculates the length of the URL in characters
     *
     * @param url The input URL
     * @return the length of the URL
     */
    public int length(URL url) {
        return url.toString().length();
    }

    /**
     * Calculates the number of path levels in the given URL. The landing page suffix and
     * any trailing slashes are ignored.
     *
     * @param url The input URL
     * @return the number of levels, where a top-level URL is 0
     */
    public int levels(URL url) {
        String path = stripTrailingSlashes(getPathWithoutSuffix(url));
        int levels = 0;
        for (int i = 0; i < path.length(); i++) {
            if (path.charAt(i) == '/') {
                levels++;
            }
        }
        return levels;
    }

    /**
     * Calculates whether a URL is a top level page, i.e. it has no query and nothing is
     * left of its path once the landing page suffix and trailing slashes are removed.
     *
     * @param url The input URL
     * @return true if page is a top level page
     */
    public boolean isTopLevelPage(URL url) {
        return url.getQuery() == null && stripTrailingSlashes(getPathWithoutSuffix(url)).isEmpty();
    }

    /**
     * Calculates whether the URL is a landing page or not. A URL with a query is never
     * considered a landing page.
     *
     * @param url The input URL
     * @return true if URL represents a landing page (index page)
     */
    public boolean isLandingPage(URL url) {
        return url.getQuery() == null && !landingPageSuffix(url).isEmpty();
    }

    /**
     * Parses the given string as a URI, normalizes it and converts it to a URL.
     *
     * @param url the URL string
     * @return the normalized URL
     * @throws MalformedURLException if the normalized URI cannot be converted to a URL
     * @throws URISyntaxException if the string is not a valid URI
     * @throws IllegalArgumentException if the URI is not absolute
     */
    public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException {
        return new URI(url).normalize().toURL();
    }

    /**
     * @return true if this processor classifies the documents it sees
     */
    public boolean isEnabled() {
        return enabled;
    }

    /**
     * @param enabled whether this processor should classify the documents it sees
     */
    public void setEnabled(boolean enabled) {
        this.enabled = enabled;
    }

    /**
     * Returns the landing page suffix the lower-cased path of the URL ends with, or the
     * empty string if there is none. A file name suffix only counts when it is the whole
     * last path segment, so that e.g. {@code /myindex.html} is not taken for an index page.
     */
    private String landingPageSuffix(URL url) {
        String path = url.getPath().toLowerCase(Locale.ROOT);
        for (String suffix : LANDING_PAGE_SUFFIXES) {
            if (!path.endsWith(suffix)) {
                continue;
            }
            int suffixStart = path.length() - suffix.length();
            if ("/".equals(suffix) || suffixStart == 0 || path.charAt(suffixStart - 1) == '/') {
                return suffix;
            }
        }
        return "";
    }

    /** Returns the lower-cased path of the URL with its landing page suffix (if any) cut off. */
    private String getPathWithoutSuffix(URL url) {
        String path = url.getPath().toLowerCase(Locale.ROOT);
        // landingPageSuffix() only returns suffixes the lower-cased path really ends with.
        return path.substring(0, path.length() - landingPageSuffix(url).length());
    }

    /** Returns the given path without any slashes at its end. */
    private static String stripTrailingSlashes(String path) {
        int end = path.length();
        while (end > 0 && path.charAt(end - 1) == '/') {
            end--;
        }
        return path.substring(0, end);
    }
}
